1 package org.apache.lucene.analysis.pt;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.InputStreamReader;
23 import java.io.LineNumberReader;
24 import java.nio.charset.StandardCharsets;
25 import java.util.ArrayList;
26 import java.util.Arrays;
27 import java.util.HashMap;
28 import java.util.List;
29 import java.util.Map;
30 import java.util.regex.Matcher;
31 import java.util.regex.Pattern;
32
33 import org.apache.lucene.analysis.util.CharArraySet;
34
35 import static org.apache.lucene.analysis.util.StemmerUtil.*;
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84 public abstract class RSLPStemmerBase {
85
86
87
88
89 protected static class Rule {
90 protected final char suffix[];
91 protected final char replacement[];
92 protected final int min;
93
94
95
96
97
98
99
100 public Rule(String suffix, int min, String replacement) {
101 this.suffix = suffix.toCharArray();
102 this.replacement = replacement.toCharArray();
103 this.min = min;
104 }
105
106
107
108
109 public boolean matches(char s[], int len) {
110 return (len - suffix.length >= min && endsWith(s, len, suffix));
111 }
112
113
114
115
116 public int replace(char s[], int len) {
117 if (replacement.length > 0) {
118 System.arraycopy(replacement, 0, s, len - suffix.length, replacement.length);
119 }
120 return len - suffix.length + replacement.length;
121 }
122 }
123
124
125
126
127 protected static class RuleWithSetExceptions extends Rule {
128 protected final CharArraySet exceptions;
129
130 public RuleWithSetExceptions(String suffix, int min, String replacement,
131 String[] exceptions) {
132 super(suffix, min, replacement);
133 for (int i = 0; i < exceptions.length; i++) {
134 if (!exceptions[i].endsWith(suffix))
135 throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
136 }
137 this.exceptions = new CharArraySet(Arrays.asList(exceptions), false);
138 }
139
140 @Override
141 public boolean matches(char s[], int len) {
142 return super.matches(s, len) && !exceptions.contains(s, 0, len);
143 }
144 }
145
146
147
148
149 protected static class RuleWithSuffixExceptions extends Rule {
150
151 protected final char[][] exceptions;
152
153 public RuleWithSuffixExceptions(String suffix, int min, String replacement,
154 String[] exceptions) {
155 super(suffix, min, replacement);
156 for (int i = 0; i < exceptions.length; i++) {
157 if (!exceptions[i].endsWith(suffix))
158 throw new RuntimeException("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
159 }
160 this.exceptions = new char[exceptions.length][];
161 for (int i = 0; i < exceptions.length; i++)
162 this.exceptions[i] = exceptions[i].toCharArray();
163 }
164
165 @Override
166 public boolean matches(char s[], int len) {
167 if (!super.matches(s, len))
168 return false;
169
170 for (int i = 0; i < exceptions.length; i++)
171 if (endsWith(s, len, exceptions[i]))
172 return false;
173
174 return true;
175 }
176 }
177
178
179
180
181 protected static class Step {
182 protected final String name;
183 protected final Rule rules[];
184 protected final int min;
185 protected final char[][] suffixes;
186
187
188
189
190
191
192
193
194 public Step(String name, Rule rules[], int min, String suffixes[]) {
195 this.name = name;
196 this.rules = rules;
197 if (min == 0) {
198 min = Integer.MAX_VALUE;
199 for (Rule r : rules)
200 min = Math.min(min, r.min + r.suffix.length);
201 }
202 this.min = min;
203
204 if (suffixes == null || suffixes.length == 0) {
205 this.suffixes = null;
206 } else {
207 this.suffixes = new char[suffixes.length][];
208 for (int i = 0; i < suffixes.length; i++)
209 this.suffixes[i] = suffixes[i].toCharArray();
210 }
211 }
212
213
214
215
216 public int apply(char s[], int len) {
217 if (len < min)
218 return len;
219
220 if (suffixes != null) {
221 boolean found = false;
222
223 for (int i = 0; i < suffixes.length; i++)
224 if (endsWith(s, len, suffixes[i])) {
225 found = true;
226 break;
227 }
228
229 if (!found) return len;
230 }
231
232 for (int i = 0; i < rules.length; i++) {
233 if (rules[i].matches(s, len))
234 return rules[i].replace(s, len);
235 }
236
237 return len;
238 }
239 }
240
241
242
243
244
245 protected static Map<String,Step> parse(Class<? extends RSLPStemmerBase> clazz, String resource) {
246
247 try {
248 InputStream is = clazz.getResourceAsStream(resource);
249 LineNumberReader r = new LineNumberReader(new InputStreamReader(is, StandardCharsets.UTF_8));
250 Map<String,Step> steps = new HashMap<>();
251 String step;
252 while ((step = readLine(r)) != null) {
253 Step s = parseStep(r, step);
254 steps.put(s.name, s);
255 }
256 r.close();
257 return steps;
258 } catch (IOException e) {
259 throw new RuntimeException(e);
260 }
261 }
262
263 private static final Pattern headerPattern =
264 Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*(0|1),\\s*\\{(.*)\\},\\s*$");
265 private static final Pattern stripPattern =
266 Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+)\\s*\\}\\s*(,|(\\}\\s*;))$");
267 private static final Pattern repPattern =
268 Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\"\\}\\s*(,|(\\}\\s*;))$");
269 private static final Pattern excPattern =
270 Pattern.compile("^\\{\\s*\"([^\"]*)\",\\s*([0-9]+),\\s*\"([^\"]*)\",\\s*\\{(.*)\\}\\s*\\}\\s*(,|(\\}\\s*;))$");
271
272 private static Step parseStep(LineNumberReader r, String header) throws IOException {
273 Matcher matcher = headerPattern.matcher(header);
274 if (!matcher.find()) {
275 throw new RuntimeException("Illegal Step header specified at line " + r.getLineNumber());
276 }
277 assert matcher.groupCount() == 4;
278 String name = matcher.group(1);
279 int min = Integer.parseInt(matcher.group(2));
280 int type = Integer.parseInt(matcher.group(3));
281 String suffixes[] = parseList(matcher.group(4));
282 Rule rules[] = parseRules(r, type);
283 return new Step(name, rules, min, suffixes);
284 }
285
286 private static Rule[] parseRules(LineNumberReader r, int type) throws IOException {
287 List<Rule> rules = new ArrayList<>();
288 String line;
289 while ((line = readLine(r)) != null) {
290 Matcher matcher = stripPattern.matcher(line);
291 if (matcher.matches()) {
292 rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), ""));
293 } else {
294 matcher = repPattern.matcher(line);
295 if (matcher.matches()) {
296 rules.add(new Rule(matcher.group(1), Integer.parseInt(matcher.group(2)), matcher.group(3)));
297 } else {
298 matcher = excPattern.matcher(line);
299 if (matcher.matches()) {
300 if (type == 0) {
301 rules.add(new RuleWithSuffixExceptions(matcher.group(1),
302 Integer.parseInt(matcher.group(2)),
303 matcher.group(3),
304 parseList(matcher.group(4))));
305 } else {
306 rules.add(new RuleWithSetExceptions(matcher.group(1),
307 Integer.parseInt(matcher.group(2)),
308 matcher.group(3),
309 parseList(matcher.group(4))));
310 }
311 } else {
312 throw new RuntimeException("Illegal Step rule specified at line " + r.getLineNumber());
313 }
314 }
315 }
316 if (line.endsWith(";"))
317 return rules.toArray(new Rule[rules.size()]);
318 }
319 return null;
320 }
321
322 private static String[] parseList(String s) {
323 if (s.length() == 0)
324 return null;
325 String list[] = s.split(",");
326 for (int i = 0; i < list.length; i++)
327 list[i] = parseString(list[i].trim());
328 return list;
329 }
330
331 private static String parseString(String s) {
332 return s.substring(1, s.length()-1);
333 }
334
335 private static String readLine(LineNumberReader r) throws IOException {
336 String line = null;
337 while ((line = r.readLine()) != null) {
338 line = line.trim();
339 if (line.length() > 0 && line.charAt(0) != '#')
340 return line;
341 }
342 return line;
343 }
344 }